Setting up environment

Import important libraries

rawdf = read.csv("loans.csv")
df = rawdf
df = df[c(-1)]
str(df)
## 'data.frame':    9578 obs. of  14 variables:
##  $ credit.policy    : chr  "1" "1" "1" "1" ...
##  $ purpose          : chr  "debt_consolidation" "credit_card" "debt_consolidation" "debt_consolidation" ...
##  $ int.rate         : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment      : num  829 228 367 162 103 ...
##  $ log.annual.inc   : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti              : chr  "19.48" "14.29" "11.63" "8.1" ...
##  $ fico             : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line: num  5640 2760 4710 2700 4066 ...
##  $ revol.bal        : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util       : chr  "52.1" "76.7" "25.6" "73.2" ...
##  $ inq.last.6mths   : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs      : chr  "0" "0" "0" "0" ...
##  $ pub.rec          : chr  "0" "0" "0" "0" ...
##  $ not.fully.paid   : int  0 0 0 0 0 0 1 1 0 0 ...
head(df)
##   credit.policy            purpose int.rate installment log.annual.inc   dti
## 1             1 debt_consolidation   0.1189      829.10       11.35041 19.48
## 2             1        credit_card   0.1071      228.22       11.08214 14.29
## 3             1 debt_consolidation   0.1357      366.86       10.37349 11.63
## 4             1 debt_consolidation   0.1008      162.34       11.35041   8.1
## 5             1        credit_card   0.1426      102.92       11.29973 14.97
## 6             1        credit_card   0.0788      125.13       11.90497 16.98
##   fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs
## 1  737          5639.958     28854       52.1              0           0
## 2  707          2760.000     33623       76.7              0           0
## 3  682          4710.000      3511       25.6              1           0
## 4  712          2699.958     33667       73.2              1           0
## 5  667          4066.000      4740       39.5              0           1
## 6  727          6120.042     50807         51              0           0
##   pub.rec not.fully.paid
## 1       0              0
## 2       0              0
## 3       0              0
## 4       0              0
## 5       0              0
## 6       0              0

Cleaning the data

Change char type to num

cols_int = c(1,12,13)
df[cols_int] = sapply(df[cols_int],as.integer)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
cols_num = c(6,10)
df[cols_num] = sapply(df[cols_num],as.numeric)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

change purpose to factor type

df$purpose <-factor(df$purpose)
str(df)
## 'data.frame':    9578 obs. of  14 variables:
##  $ credit.policy    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ purpose          : Factor w/ 7 levels "all_other","credit_card",..: 3 2 3 3 2 2 3 1 5 3 ...
##  $ int.rate         : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment      : num  829 228 367 162 103 ...
##  $ log.annual.inc   : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti              : num  19.5 14.3 11.6 8.1 15 ...
##  $ fico             : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line: num  5640 2760 4710 2700 4066 ...
##  $ revol.bal        : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util       : num  52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
##  $ inq.last.6mths   : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ pub.rec          : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ not.fully.paid   : int  0 0 0 0 0 0 1 1 0 0 ...
sum(is.na(df))
## [1] 191
(sum(is.na(df))/nrow(df)) * 100
## [1] 1.994153

number of missing data and percentage

We can delete all the rows which has the missing value because it’s only 1.9% pf the dataset

df = na.omit(df)
nrow(df)
## [1] 9508
sum(is.na(df))
## [1] 0

Delete the row with missing values and we still have 9508 rows

summary(df)
##  credit.policy                  purpose        int.rate        installment    
##  Min.   :0.0000   all_other         :2288   Min.   : 0.0600   Min.   : 15.67  
##  1st Qu.:1.0000   credit_card       :1259   1st Qu.: 0.1039   1st Qu.:164.02  
##  Median :1.0000   debt_consolidation:3944   Median : 0.1221   Median :269.55  
##  Mean   :0.8076   educational       : 340   Mean   : 0.1256   Mean   :320.15  
##  3rd Qu.:1.0000   home_improvement  : 627   3rd Qu.: 0.1407   3rd Qu.:435.23  
##  Max.   :1.0000   major_purchase    : 432   Max.   :14.7000   Max.   :940.14  
##                   small_business    : 618                                     
##  log.annual.inc        dti              fico        days.with.cr.line
##  Min.   : 7.548   Min.   : 0.000   Min.   : 612.0   Min.   :  180    
##  1st Qu.:10.565   1st Qu.: 7.228   1st Qu.: 682.0   1st Qu.: 2820    
##  Median :10.933   Median :12.700   Median : 707.0   Median : 4140    
##  Mean   :10.934   Mean   :12.630   Mean   : 711.2   Mean   : 4566    
##  3rd Qu.:11.293   3rd Qu.:18.000   3rd Qu.: 737.0   3rd Qu.: 5730    
##  Max.   :14.528   Max.   :29.960   Max.   :1812.0   Max.   :17640    
##                                                                      
##    revol.bal         revol.util     inq.last.6mths    delinq.2yrs     
##  Min.   :      0   Min.   :  0.00   Min.   : 0.000   Min.   : 0.0000  
##  1st Qu.:   3273   1st Qu.: 22.70   1st Qu.: 0.000   1st Qu.: 0.0000  
##  Median :   8690   Median : 46.40   Median : 1.000   Median : 0.0000  
##  Mean   :  16998   Mean   : 46.94   Mean   : 1.573   Mean   : 0.1637  
##  3rd Qu.:  18375   3rd Qu.: 71.00   3rd Qu.: 2.000   3rd Qu.: 0.0000  
##  Max.   :1207359   Max.   :670.00   Max.   :33.000   Max.   :13.0000  
##                                                                       
##     pub.rec        not.fully.paid  
##  Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000  
##  Mean   :0.06216   Mean   :0.1597  
##  3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :5.00000   Max.   :1.0000  
## 

Find the correlation of the data to fico

cor_matrix = cor(df[-c(2)], method = "spearman")
names = rownames(cor_matrix)
abs_cor = abs(cor_matrix)
data = data.frame(X_var = names,abs_cor = abs_cor,cor = cor_matrix)

cortmp = data[order(data$abs_cor.fico)]
cortmp['abs_cor.fico']
##                   abs_cor.fico
## credit.policy       0.35843775
## int.rate            0.74680656
## installment         0.08555879
## log.annual.inc      0.10600806
## dti                 0.21387901
## fico                1.00000000
## days.with.cr.line   0.25133508
## revol.bal           0.09472345
## revol.util          0.51904804
## inq.last.6mths      0.17666043
## delinq.2yrs         0.23756086
## pub.rec             0.14881785
## not.fully.paid      0.14706919

We could know that ‘int.rate’,‘revol.util’, and ‘credit.policy’ has correlation with ‘fico’

cor_matrix = cor(df[-c(2)], method = "spearman",)
cor_matrix = round(cor_matrix, 2)
cor_matrix
##                   credit.policy int.rate installment log.annual.inc   dti  fico
## credit.policy              1.00    -0.30        0.07           0.03 -0.09  0.36
## int.rate                  -0.30     1.00        0.24           0.04  0.22 -0.75
## installment                0.07     0.24        1.00           0.43  0.06  0.09
## log.annual.inc             0.03     0.04        0.43           1.00 -0.06  0.11
## dti                       -0.09     0.22        0.06          -0.06  1.00 -0.21
## fico                       0.36    -0.75        0.09           0.11 -0.21  1.00
## days.with.cr.line          0.11    -0.13        0.20           0.40  0.07  0.25
## revol.bal                 -0.02     0.15        0.35           0.42  0.37 -0.09
## revol.util                -0.11     0.47        0.09           0.05  0.33 -0.52
## inq.last.6mths            -0.43     0.18        0.00           0.03  0.03 -0.18
## delinq.2yrs               -0.06     0.17       -0.01           0.03 -0.02 -0.24
## pub.rec                   -0.05     0.10       -0.03           0.01  0.01 -0.15
## not.fully.paid            -0.16     0.15        0.04          -0.03  0.04 -0.15
##                   days.with.cr.line revol.bal revol.util inq.last.6mths
## credit.policy                  0.11     -0.02      -0.11          -0.43
## int.rate                      -0.13      0.15       0.47           0.18
## installment                    0.20      0.35       0.09           0.00
## log.annual.inc                 0.40      0.42       0.05           0.03
## dti                            0.07      0.37       0.33           0.03
## fico                           0.25     -0.09      -0.52          -0.18
## days.with.cr.line              1.00      0.32       0.00          -0.04
## revol.bal                      0.32      1.00       0.52          -0.02
## revol.util                     0.00      0.52       1.00          -0.01
## inq.last.6mths                -0.04     -0.02      -0.01           1.00
## delinq.2yrs                    0.09     -0.06      -0.03           0.02
## pub.rec                        0.10     -0.03       0.07           0.06
## not.fully.paid                -0.03      0.02       0.08           0.13
##                   delinq.2yrs pub.rec not.fully.paid
## credit.policy           -0.06   -0.05          -0.16
## int.rate                 0.17    0.10           0.15
## installment             -0.01   -0.03           0.04
## log.annual.inc           0.03    0.01          -0.03
## dti                     -0.02    0.01           0.04
## fico                    -0.24   -0.15          -0.15
## days.with.cr.line        0.09    0.10          -0.03
## revol.bal               -0.06   -0.03           0.02
## revol.util              -0.03    0.07           0.08
## inq.last.6mths           0.02    0.06           0.13
## delinq.2yrs              1.00    0.00           0.01
## pub.rec                  0.00    1.00           0.06
## not.fully.paid           0.01    0.06           1.00
corrplot(cor_matrix,method='number')

## Check for the outliers

bar <- ggplot(df,aes(fico))+geom_histogram(aes(fill=factor(not.fully.paid)),color='black',bins = 40,alpha=0.5)
bar+scale_fill_manual(values = c("#FF5733","#44FF33"))+theme_bw()

boxplot(fico~purpose,data=df,col='orange')

ggplot(df) +
  aes(x = fico, y = int.rate) +
  geom_point(shape = "circle", size = 1.5, colour = "#228B22")

We have outliers, we need to handle with which int.rate > 10 and fico > 1500

df = df[-which(df$fico > 850),]
df = df[-which(df$int.rate> 5),]
df = df[-which(df$revol.util > 500),]
df = df[-which(df$revol.bal > 75000),]
ggplot(df) +
  aes(x = fico, y = int.rate) +
  geom_point(shape = "circle", size = 1.5, colour = "#228B22")

ggplot(df) +
  aes(x = revol.bal, y = revol.util) +
  geom_point(shape = "circle", size = 1.5, colour = "#228B22")

ggplot(df) +
  aes(x = fico, y = inq.last.6mths) +
  geom_point(shape = "circle", size = 1.5, colour = "#228B22")

ggplot(df) +
  aes(x = fico, y = delinq.2yrs) +
  geom_point(shape = "circle", size = 1.5, colour = "#228B22")

bar<- ggplot(df,aes(factor(purpose)))+geom_bar(aes(fill=factor(not.fully.paid)),position='dodge')
bar+theme(axis.text.x =element_text(angle = 90,size = 10,vjust = 0.5))+theme_bw()

box <- ggplot(df,aes(fico))+geom_histogram(aes(fill=factor(not.fully.paid)),color='black',bins = 40,alpha=0.5)
box+scale_fill_manual(values = c("#FF5733","#44FF33"))+theme_bw()

boxplot(fico~purpose,data=df,col='orange')

and now we delete the outliers of variable int.rate, revol.util and fico

Creating A Model

Linear Model

Split for linear regression by using split ratio at 0.7

set.seed(99)
split = sample.split (df$fico, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)

3

Building Linear Model

linearMod = lm(fico ~ .,data = df_train)
summary(linearMod)
## 
## Call:
## lm(formula = fico ~ ., data = df_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -94.37 -13.33  -1.35  11.99 109.71 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                8.075e+02  5.801e+00 139.207  < 2e-16 ***
## credit.policy              1.022e+01  8.483e-01  12.052  < 2e-16 ***
## purposecredit_card        -3.065e+00  8.973e-01  -3.415 0.000642 ***
## purposedebt_consolidation -3.540e+00  6.799e-01  -5.207 1.98e-07 ***
## purposeeducational        -3.203e+00  1.449e+00  -2.210 0.027124 *  
## purposehome_improvement    2.021e+00  1.117e+00   1.810 0.070417 .  
## purposemajor_purchase      5.146e-01  1.281e+00   0.402 0.688006    
## purposesmall_business      1.354e+01  1.188e+00  11.395  < 2e-16 ***
## int.rate                  -8.491e+02  1.274e+01 -66.655  < 2e-16 ***
## installment                4.330e-02  1.535e-03  28.213  < 2e-16 ***
## log.annual.inc            -4.301e-01  5.322e-01  -0.808 0.419058    
## dti                       -1.747e-01  4.153e-02  -4.206 2.63e-05 ***
## days.with.cr.line          2.123e-03  1.139e-04  18.637  < 2e-16 ***
## revol.bal                 -2.431e-05  2.371e-05  -1.026 0.305093    
## revol.util                -2.989e-01  1.150e-02 -25.997  < 2e-16 ***
## inq.last.6mths             1.395e-01  1.441e-01   0.968 0.333113    
## delinq.2yrs               -9.043e+00  4.632e-01 -19.522  < 2e-16 ***
## pub.rec                   -9.746e+00  9.821e-01  -9.924  < 2e-16 ***
## not.fully.paid            -3.436e+00  7.137e-01  -4.815 1.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.25 on 6425 degrees of freedom
## Multiple R-squared:  0.7158, Adjusted R-squared:  0.715 
## F-statistic: 899.1 on 18 and 6425 DF,  p-value: < 2.2e-16
df_test$FicoLinear = predict(linearMod,df_test)
LinearFiRM = rmse(df_test$fico, df_test$FicoLinear)
LinearFiRM
## [1] 20.6761

Now we have rmse value at 20.871 by using linear model

linearMod = lm(fico ~ int.rate * purpose+ int.rate*installment + dti*days.with.cr.line +delinq.2yrs*pub.rec*not.fully.paid,data = df_train)
summary(linearMod)
## 
## Call:
## lm(formula = fico ~ int.rate * purpose + int.rate * installment + 
##     dti * days.with.cr.line + delinq.2yrs * pub.rec * not.fully.paid, 
##     data = df_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -118.267  -13.196   -1.576   11.802   98.017 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         8.172e+02  3.083e+00 265.061  < 2e-16 ***
## int.rate                           -1.035e+03  2.393e+01 -43.236  < 2e-16 ***
## purposecredit_card                  2.411e+00  4.447e+00   0.542 0.587799    
## purposedebt_consolidation          -8.452e+00  3.287e+00  -2.571 0.010155 *  
## purposeeducational                 -2.736e+00  7.238e+00  -0.378 0.705415    
## purposehome_improvement             1.009e+01  5.143e+00   1.962 0.049862 *  
## purposemajor_purchase               2.678e+00  5.761e+00   0.465 0.641999    
## purposesmall_business              -4.860e+01  5.658e+00  -8.590  < 2e-16 ***
## installment                         7.216e-02  6.877e-03  10.493  < 2e-16 ***
## dti                                 2.922e-02  8.044e-02   0.363 0.716403    
## days.with.cr.line                   3.158e-03  2.187e-04  14.439  < 2e-16 ***
## delinq.2yrs                        -7.781e+00  5.420e-01 -14.356  < 2e-16 ***
## pub.rec                            -1.138e+01  1.198e+00  -9.498  < 2e-16 ***
## not.fully.paid                     -4.470e+00  8.205e-01  -5.448 5.29e-08 ***
## int.rate:purposecredit_card        -7.579e+01  3.676e+01  -2.062 0.039257 *  
## int.rate:purposedebt_consolidation  2.082e+01  2.658e+01   0.783 0.433609    
## int.rate:purposeeducational         4.973e+00  6.001e+01   0.083 0.933963    
## int.rate:purposehome_improvement   -6.012e+01  4.235e+01  -1.419 0.155819    
## int.rate:purposemajor_purchase     -6.813e+00  4.925e+01  -0.138 0.889968    
## int.rate:purposesmall_business      4.783e+02  4.142e+01  11.546  < 2e-16 ***
## int.rate:installment               -1.730e-01  5.145e-02  -3.362 0.000778 ***
## dti:days.with.cr.line              -9.891e-05  1.559e-05  -6.344 2.39e-10 ***
## delinq.2yrs:pub.rec                 6.188e+00  1.559e+00   3.969 7.28e-05 ***
## delinq.2yrs:not.fully.paid         -1.494e+00  1.493e+00  -1.001 0.317014    
## pub.rec:not.fully.paid              2.317e+00  2.563e+00   0.904 0.366056    
## delinq.2yrs:pub.rec:not.fully.paid -3.100e+00  4.505e+00  -0.688 0.491385    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.33 on 6418 degrees of freedom
## Multiple R-squared:  0.685,  Adjusted R-squared:  0.6838 
## F-statistic: 558.3 on 25 and 6418 DF,  p-value: < 2.2e-16
df_test$FicoLinear = predict(linearMod,df_test)
LinearFiRM = rmse(df_test$fico, df_test$FicoLinear)
LinearFiRM
## [1] 22.19836

Logistic Regression

split at the rate of 0.7

set.seed(99)
split = sample.split (df$not.fully.paid, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)

4

For Baseline Accuracy

t = table(df_test$not.fully.paid)
t
## 
##    0    1 
## 2329  432
accuracy = t[1]/sum(t)
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.84

Baseline accuracy is at 84%

5

LogRegModel = glm(not.fully.paid ~., family = binomial, df_train) 
summary(LogRegModel)
## 
## Call:
## glm(formula = not.fully.paid ~ ., family = binomial, data = df_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6005  -0.6169  -0.4956  -0.3656   2.5350  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                8.401e+00  1.610e+00   5.218 1.81e-07 ***
## credit.policy             -3.753e-01  1.041e-01  -3.606 0.000311 ***
## purposecredit_card        -6.647e-01  1.387e-01  -4.792 1.65e-06 ***
## purposedebt_consolidation -3.892e-01  9.405e-02  -4.138 3.50e-05 ***
## purposeeducational         4.720e-02  1.857e-01   0.254 0.799368    
## purposehome_improvement    5.299e-02  1.557e-01   0.340 0.733632    
## purposemajor_purchase     -4.685e-01  2.061e-01  -2.274 0.022991 *  
## purposesmall_business      4.845e-01  1.437e-01   3.372 0.000747 ***
## int.rate                   6.223e-01  2.129e+00   0.292 0.770074    
## installment                1.420e-03  2.193e-04   6.476 9.42e-11 ***
## log.annual.inc            -3.669e-01  7.705e-02  -4.762 1.92e-06 ***
## dti                       -2.530e-03  5.738e-03  -0.441 0.659310    
## fico                      -9.208e-03  1.741e-03  -5.290 1.22e-07 ***
## days.with.cr.line          1.846e-05  1.640e-05   1.126 0.260087    
## revol.bal                  1.470e-06  3.338e-06   0.440 0.659640    
## revol.util                 3.810e-03  1.626e-03   2.343 0.019117 *  
## inq.last.6mths             6.623e-02  1.611e-02   4.111 3.94e-05 ***
## delinq.2yrs               -8.960e-02  6.690e-02  -1.339 0.180500    
## pub.rec                    8.764e-02  1.244e-01   0.705 0.481022    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5592.1  on 6441  degrees of freedom
## Residual deviance: 5238.3  on 6423  degrees of freedom
## AIC: 5276.3
## 
## Number of Fisher Scoring iterations: 5
df_test$PredictedRisk = predict(LogRegModel, type = "response", df_test)
LogRegPredict = predict(LogRegModel, type = "response", df_test)
plot(df_test$PredictedRisk)

df_test$PredictedRisk_Cat = ifelse(df_test$PredictedRisk > 0.25,1,0)
t = table(df_test$not.fully.paid, df_test$PredictedRisk_Cat)

Accuracy Test

t
##    
##        0    1
##   0 2090  239
##   1  309  123
accuracy = sum(diag(t)/sum(t))
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.8
ROCRpred = prediction (LogRegPredict, df_test$not.fully.paid)
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.01), text.adj = c(-0.2, 1.7))
abline(v=0.25)

AUCVal6 = as.numeric (performance (ROCRpred, "auc") @y.values) 

cat("at threshold = 0.25 AUC Value is :", AUCVal6)
## at threshold = 0.25 AUC Value is : 0.6817671

AUC Value around 0.68 at threshold = 0.25

7

What is the best threshold value to maximize true positive rate while keeping false positive at max 25% (or 0.25)?

LogRegModel = glm(not.fully.paid ~ ., family = binomial, df_train) 
summary(LogRegModel)
## 
## Call:
## glm(formula = not.fully.paid ~ ., family = binomial, data = df_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6005  -0.6169  -0.4956  -0.3656   2.5350  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                8.401e+00  1.610e+00   5.218 1.81e-07 ***
## credit.policy             -3.753e-01  1.041e-01  -3.606 0.000311 ***
## purposecredit_card        -6.647e-01  1.387e-01  -4.792 1.65e-06 ***
## purposedebt_consolidation -3.892e-01  9.405e-02  -4.138 3.50e-05 ***
## purposeeducational         4.720e-02  1.857e-01   0.254 0.799368    
## purposehome_improvement    5.299e-02  1.557e-01   0.340 0.733632    
## purposemajor_purchase     -4.685e-01  2.061e-01  -2.274 0.022991 *  
## purposesmall_business      4.845e-01  1.437e-01   3.372 0.000747 ***
## int.rate                   6.223e-01  2.129e+00   0.292 0.770074    
## installment                1.420e-03  2.193e-04   6.476 9.42e-11 ***
## log.annual.inc            -3.669e-01  7.705e-02  -4.762 1.92e-06 ***
## dti                       -2.530e-03  5.738e-03  -0.441 0.659310    
## fico                      -9.208e-03  1.741e-03  -5.290 1.22e-07 ***
## days.with.cr.line          1.846e-05  1.640e-05   1.126 0.260087    
## revol.bal                  1.470e-06  3.338e-06   0.440 0.659640    
## revol.util                 3.810e-03  1.626e-03   2.343 0.019117 *  
## inq.last.6mths             6.623e-02  1.611e-02   4.111 3.94e-05 ***
## delinq.2yrs               -8.960e-02  6.690e-02  -1.339 0.180500    
## pub.rec                    8.764e-02  1.244e-01   0.705 0.481022    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5592.1  on 6441  degrees of freedom
## Residual deviance: 5238.3  on 6423  degrees of freedom
## AIC: 5276.3
## 
## Number of Fisher Scoring iterations: 5
df_test$PredictedRisk = predict(LogRegModel, type = "response", df_test)
LogRegPredict = predict(LogRegModel, type = "response", df_test)

plot(df_test$PredictedRisk)

df_test$PredictedRisk_Cat = ifelse(df_test$PredictedRisk > 0.185,1,0)
t = table(df_test$not.fully.paid, df_test$PredictedRisk_Cat)

t
##    
##        0    1
##   0 1735  594
##   1  218  214
accuracy = sum(diag(t)/sum(t))
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.71

Best threshold if we want to keep false potive at 0.25 is 0.185

ROCRpred = prediction (LogRegPredict, df_test$not.fully.paid)
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.1), text.adj = c(-0.2, 1.7))

as.numeric (performance (ROCRpred, "auc") @y.values) 
## [1] 0.6817671

8 Simpler model

Using only int.rate to create Regression model

LogRegModel = glm(not.fully.paid ~ int.rate , family = binomial, df_train) 
summary(LogRegModel)
## 
## Call:
## glm(formula = not.fully.paid ~ int.rate, family = binomial, data = df_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0532  -0.6209  -0.5356  -0.4298   2.3052  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -3.722      0.173  -21.52   <2e-16 ***
## int.rate      16.135      1.303   12.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5592.1  on 6441  degrees of freedom
## Residual deviance: 5434.2  on 6440  degrees of freedom
## AIC: 5438.2
## 
## Number of Fisher Scoring iterations: 4
df_test$PredictedRisk = predict(LogRegModel, type = "response", df_test)
LogRegPredict = predict(LogRegModel, type = "response", df_test)


plot(df_test$PredictedRisk)

df_test$PredictedRisk_Cat = ifelse(df_test$PredictedRisk > 0.20,1,0)
t = table(df_test$not.fully.paid, df_test$PredictedRisk_Cat)

Accuracy of my simple model

t
##    
##        0    1
##   0 1907  422
##   1  305  127
accuracy = sum(diag(t)/sum(t))
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.74
ROCRpred = prediction (LogRegPredict, df_test$not.fully.paid)
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.05), text.adj = c(-0.2, 1.7))

AUCVal = as.numeric (performance (ROCRpred, "auc") @y.values) 

cat("The AUC Value of my model is", round(AUCVal,3))
## The AUC Value of my model is 0.622

9

investment = df_test
investment$profit = (1 + investment$int.rate) ^ 3 - 1
investment$profit[investment$PredictedRisk_Cat == 1] = -1
summary(investment$profit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.0000  0.2510  0.3728  0.1073  0.4250  0.4984
sum(investment$profit)
## [1] 296.1645
head(investment)
##    credit.policy            purpose int.rate installment log.annual.inc   dti
## 4              1 debt_consolidation   0.1008      162.34       11.35041  8.10
## 6              1        credit_card   0.0788      125.13       11.90497 16.98
## 25             1 debt_consolidation   0.1229      320.19       11.26446  8.80
## 27             1          all_other   0.0743      155.38       11.08214  0.28
## 31             1 debt_consolidation   0.0807      156.84       11.51293  2.30
## 41             1   home_improvement   0.0807      156.84       12.10071  5.55
##    fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs
## 4   712          2699.958     33667       73.2              1           0
## 6   727          6120.042     50807       51.0              0           0
## 25  672          3760.958      4822       58.1              0           0
## 27  802          4649.958      1576        5.7              1           0
## 31  742          3148.958      9698       19.4              0           0
## 41  742          4019.000     40934       26.3              0           0
##    pub.rec not.fully.paid PredictedRisk PredictedRisk_Cat    profit
## 4        0              0    0.10955325                 0 0.3339061
## 6        0              0    0.07941841                 0 0.2555176
## 25       1              0    0.14947352                 0 0.4158696
## 27       0              0    0.07426959                 0 0.2398716
## 31       0              0    0.08168879                 0 0.2621630
## 41       0              0    0.08168879                 0 0.2621630

10

What is the average profit of a $1 investment in one of these high-interest loans? What proportion of the high-interest loans were not paid back in full?

HighInterest = df_test
HighInterest = HighInterest[which(HighInterest$int.rate >0.15),]
HighInterest$profit = (1 + HighInterest$int.rate) ^ 3 - 1
HighInterest$profit[HighInterest$PredictedRisk_Cat ==1] = -1

summary(HighInterest$profit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      -1      -1      -1      -1      -1      -1
HighIntProfit = sum(HighInterest$profit)

proportion = mean(HighInterest$PredictedRisk_Cat)
proportion
## [1] 1

10.1 Every 1 dollar you invest you will lose 8.88 cent ( or 0.08 dollars )by use the mean of profit variable 10.2 Proportion of the high-interest loans were not paid back in full is 0.69 or 69%

11

What is the profit to an investor who invested $1 in each of these 100 loans? How does this compare to investing in all loans? How does this compare to investing in all loans?

SelectedLoans = sqldf("select *
                      from HighInterest
                      order by PredictedRisk")

SelectedLoans = head(SelectedLoans,100)
SelectedLProfit = sum(SelectedLoans$profit)
SelectedLProfit - HighIntProfit
## [1] 308

11.1 Profit of the investor who invested 1 dollars in each of the loans will receive profit around 52 dollars 11.2 Compare to investing in all loans will receive more around 89 dollars

12

Decision Tree

set.seed(99)
split = sample.split (df$fico, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)

Decision Tree with Regression predict fico

prpDCTR = rpart(fico ~ ., df_train,control = rpart.control(cp= 0.004))
PredictFico = predict(prpDCTR, df_test, method = "anova")
plot(df_test$fico, PredictFico)

DeciTrRM=sqrt(mean((df_test$fico - PredictFico)^2))
rpart.plot(prpDCTR)

rpart.rules(prpDCTR, cover=TRUE)
##  fico                                                                                                                                                                                                                                                                   cover
##   660 when int.rate >=          0.132                                         & purpose is all_other or credit_card or debt_consolidation or educational or home_improvement or major_purchase & credit.policy is 0 & inq.last.6mths <  4                                  5%
##   676 when int.rate >=          0.132                                         & purpose is all_other or credit_card or debt_consolidation or educational or home_improvement or major_purchase & credit.policy is 0 & inq.last.6mths >= 4                                  6%
##   681 when int.rate >=          0.132                                         & purpose is all_other or credit_card or debt_consolidation or educational or home_improvement or major_purchase & credit.policy is 1 & inq.last.6mths <  4                                 20%
##   693 when int.rate is 0.122 to 0.132                    & installment <  401                                                                                                                                                                                             11%
##   696 when int.rate >=          0.132 & revol.util >= 14                      & purpose is                                                                                      small_business                                                                             3%
##   707 when int.rate is 0.096 to 0.122 & revol.util >= 25 & installment <  414                                                                                                                                                                                             16%
##   713 when int.rate is 0.122 to 0.132                    & installment >= 401                                                                                                                                                                                              4%
##   721 when int.rate is 0.096 to 0.122 & revol.util <  25 & installment <  414                                                                                                                                                                                              6%
##   734 when int.rate is 0.096 to 0.122 & revol.util >= 11 & installment >= 414                                                                                                                                                                                              7%
##   738 when int.rate >=          0.132 & revol.util <  14                      & purpose is                                                                                      small_business                                                                             1%
##   747 when int.rate <  0.096          & revol.util >= 17                                                                                                                                                                                                                  12%
##   753 when int.rate <  0.096          & revol.util <  17                                                                                                                                                                                  & days.with.cr.line <  3810      3%
##   754 when int.rate >=          0.132                                         & purpose is all_other or credit_card or debt_consolidation or educational or home_improvement or major_purchase & credit.policy is 1 & inq.last.6mths >= 4                                  0%
##   765 when int.rate is 0.096 to 0.122 & revol.util <  11 & installment >= 414                                                                                                                                                                                              2%
##   778 when int.rate <  0.096          & revol.util <  17                                                                                                                                                                                  & days.with.cr.line >= 3810      6%
plotcp(prpDCTR)

DeciTrRM
## [1] 22.63712
LinearFiRM
## [1] 22.19836
cat("Different of RMSE is Decision Tree Model RMSE - Linear Model RMSE: ",DeciTrRM - LinearFiRM)
## Different of RMSE is Decision Tree Model RMSE - Linear Model RMSE:  0.4387531

RMSE of Decision Tree is at 22.63 for Linear Regression is 20.67 It means that Linear Regression slightly better because RMSE is lower than 1.96

Decision Tree predict not.fully.paid

set.seed(99)
split = sample.split (df$not.fully.paid, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
nfpDCTM = rpart(not.fully.paid~.,data=df_train, method = 'class',cp=0.002)
prp(nfpDCTM)

rpart.plot(nfpDCTM)

printcp(nfpDCTM)
## 
## Classification tree:
## rpart(formula = not.fully.paid ~ ., data = df_train, method = "class", 
##     cp = 0.002)
## 
## Variables actually used in tree construction:
##  [1] credit.policy     days.with.cr.line delinq.2yrs       dti              
##  [5] inq.last.6mths    installment       int.rate          log.annual.inc   
##  [9] purpose           revol.util       
## 
## Root node error: 1009/6442 = 0.15663
## 
## n= 6442 
## 
##          CP nsplit rel error xerror     xstd
## 1 0.0023125      0   1.00000 1.0000 0.028911
## 2 0.0020000     20   0.93756 1.0575 0.029571
plotcp(nfpDCTM)

{r} # nfpPred = predict(nfpDCTM, newdata=df_test,type='class') # table = table(nfpPred, df_test$not.fully.paid) # table # accuracy = sum(diag(table))/(sum(table)) # cat("The accuracy is", round(accuracy,3)) #

nfpPred = predict(nfpDCTM, newdata=df_test,type='prob')
nfpPred_Cat = ifelse(nfpPred[, 2] > 0.2, 1, 0)

table = table(nfpPred_Cat, df_test$not.fully.paid)
table
##            
## nfpPred_Cat    0    1
##           0 2017  297
##           1  312  135
accuracy = sum(diag(table))/(sum(table))
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.78
ROCRpred = prediction (nfpPred[, 2], df_test$not.fully.paid)
AUCVal12 = as.numeric (performance (ROCRpred, "auc") @y.values) # higher auc value is better
performance (ROCRpred, "tpr", "fpr")
## A performance instance
##   'False positive rate' vs. 'True positive rate' (alpha: 'Cutoff')
##   with 20 data points
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.1), text.adj = c(-0.2, 1.7))

AUC of Logistic Regression is higher than AUC of task 12 by 0.09

AUCVal6
## [1] 0.6817671
AUCVal12
## [1] 0.5849698
AUCVal6 - AUCVal12
## [1] 0.09679733

Random Forest

Random Forest with Regression to predict fico

set.seed(99)
split = sample.split (df$fico, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
FicoForest = randomForest(fico~.,df_train, ntree=700, mtry=2)
PredictForest = predict(FicoForest, df_test)
RanForRM = round(sqrt(mean((df_test$fico - PredictForest)^2)),2)
cat("rmse is:", RanForRM)
## rmse is: 18.11
RanForRM
## [1] 18.11
LinearFiRM
## [1] 22.19836
cat("Different of RMSE is Random Forest Model RMSE - Linear Model RMSE: ",RanForRM - LinearFiRM)
## Different of RMSE is Random Forest Model RMSE - Linear Model RMSE:  -4.088365

Random Forest with Classification to predict not.fully.paid

set.seed(99)
split = sample.split (df$not.fully.paid, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
NFPForest = randomForest(not.fully.paid ~ ., data = df_train, mtry = 3, ntree = 50)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
PredictNFP = predict(NFPForest, df_test, type = "class")
df_test$RandomForest = predict(NFPForest, df_test, type = "class")
plot(PredictNFP)

df_test$PredictedRisk = ifelse(df_test$RandomForest > 0.25,1,0)
table(df_test$not.fully.paid, df_test$PredictedRisk)
##    
##        0    1
##   0 1939  390
##   1  284  148
ROCRpred = prediction (PredictNFP, df_test$not.fully.paid)
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.1), text.adj = c(-0.2, 1.7))

AUCVal = as.numeric (performance (ROCRpred, "auc") @y.values) 

cat("The AUC Value of my model is", round(AUCVal,3))
## The AUC Value of my model is 0.67

13

Clustering

Preparing Data for clustering

df = rawdf
df = na.omit(df)
df = df[-which(df$fico > 850 ),]
df = df[-which(df$int.rate> 5),]
df = df[-which(df$revol.util > 500),]
df = df[-which(df$revol.bal > 75000),]
df = df[-c(1)]

cols_num = c(1,12,13,6,10)
df[cols_num] = sapply(df[cols_num],as.numeric)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
df_cluster = df
df_cluster = fastDummies::dummy_cols(df_cluster, select_columns = "purpose")
df_cluster = df_cluster[-c(2)]

Scale and build cluster

df_cluster = df_cluster[c(1,2,6,7,9)]
df_cluster = na.omit(df_cluster)
df_scale = scale(df_cluster)
km.out = kmeans(df_scale, 3 ,nstart=20)
N = 10
information = rep ( NA, N )
for ( i in 1: N ){
  KM = kmeans ( df_scale, centers = i, iter.max = 35, nstart = 10 )
  information [ i ] = KM$tot.withinss
}

plot ( information ~ seq ( 1:N ), type = "b", pch = 1, col = 2, ylab = "Total within Sum of Squares", lwd=2,
       xlab = "Number of Clusters", main = "Selecting K by elbow method" )

plot(df_cluster, col = km.out$cluster)

cluster1 = subset(df_cluster, km.out$cluster == 1)
cluster2 = subset(df_cluster, km.out$cluster == 2)
cluster3 = subset(df_cluster, km.out$cluster == 3)